{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "import warnings\n",
    "warnings.filterwarnings(action = 'ignore')\n",
    "%matplotlib inline\n",
    "plt.rcParams['font.sans-serif']=['SimHei']  #解决中文显示乱码问题\n",
    "plt.rcParams['axes.unicode_minus']=False\n",
    "import sklearn.linear_model as LM\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.model_selection import cross_val_score,train_test_split\n",
    "from sklearn.datasets import make_regression\n",
    "from sklearn import tree\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "良       827\n",
      "轻度污染    470\n",
      "优       377\n",
      "中度污染    252\n",
      "重度污染    127\n",
      "严重污染     43\n",
      "Name: 质量等级, dtype: int64\n",
      "输入变量:\n",
      " Index(['PM2.5', 'PM10', 'SO2', 'CO', 'NO2'], dtype='object')\n",
      "|--- feature_0 <= 75.50\n",
      "|   |--- feature_1 <= 50.50\n",
      "|   |   |--- class: 1\n",
      "|   |--- feature_1 >  50.50\n",
      "|   |   |--- class: 2\n",
      "|--- feature_0 >  75.50\n",
      "|   |--- feature_0 <= 115.50\n",
      "|   |   |--- class: 3\n",
      "|   |--- feature_0 >  115.50\n",
      "|   |   |--- class: 4\n",
      "\n",
      "训练精度:0.658874\n"
     ]
    }
   ],
   "source": [
    "data=pd.read_excel('北京市空气质量数据.xlsx')\n",
    "data=data.replace(0,np.NaN)\n",
    "data=data.dropna()\n",
    "X_train=data.iloc[:,3:-1]\n",
    "y_train=data['质量等级']\n",
    "print(y_train.value_counts())\n",
    "print(\"输入变量:\\n\",X_train.columns)\n",
    "y_train=y_train.map({'优':'1','良':'2','轻度污染':'3','中度污染':'4','重度污染':'5','严重污染':'6'})\n",
    "modelDTC = tree.DecisionTreeClassifier(max_depth=2,random_state=123)\n",
    "modelDTC.fit(X_train, y_train)\n",
    "print(tree.export_text(modelDTC))\n",
    "#print(tree.plot_tree(modelDTC))\n",
    "print(\"训练精度:%f\"%(modelDTC.score(X_train,y_train)))\n",
    "\n",
    "with open(\"D:\\jueceshu.dot\", 'w') as f:\n",
    "    f = tree.export_graphviz(modelDTC, out_file = f,filled=True,class_names=True,proportion=True,rounded=True)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "说明：（1）第1至7行：读入空气质量监测数据。进行数据预处理。\n",
    "（2）第9，10行：建立树深度等于2的分类树模型，并拟合数据。\n",
    "（3）第11行：输出分类树的文本化表达结果。\n",
    "文本化表达是以字符形式展示分类树的构成，树根在左，树叶在右。规则集包含4推理规则。\n",
    "（4）第13行：计算分类树的训练精度。因树深度较浅，预测效果不理想。\n",
    "（5）第15，16行：将分类树的图形结果保存到指定文件中。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练精度:0.880725\n"
     ]
    }
   ],
   "source": [
    "modelDTC = tree.DecisionTreeClassifier(max_depth=10,random_state=123)\n",
    "modelDTC.fit(X_train, y_train)\n",
    "print(\"训练精度:%f\"%(modelDTC.score(X_train,y_train)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
